#!/usr/bin/env python
# *************************************** #
# This file predicts food spending 
import os
import sys

# Put the project checkout and the system-wide packages at the front of the
# module search path. sys.path entries are used verbatim (no shell-style
# expansion), so '~' has to be resolved explicitly.
sys.path.insert(0, os.path.expanduser('~/git_repo'))
sys.path.insert(0, '/usr/local/lib/python2.7/dist-packages')

# Drop the per-user site-packages directory from the search path.
# list.remove() raises ValueError for a missing entry, so only remove the
# directory when it is actually present.
_user_site = os.path.expanduser('~/.local/lib/python2.7/site-packages')
if _user_site in sys.path:
    sys.path.remove(_user_site)

from py_modules.category.cleaner_MG import cleaner
import operator
import pickle
import random
from time import time

import numpy as np
from scipy.stats import randint as sp_randint
import math
import pandas as pd
import pydot
from scipy.sparse import hstack
from sklearn import tree
from sklearn.externals import joblib
from sklearn.externals.six import StringIO
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.grid_search import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
import matplotlib as mpl
from sklearn.feature_selection import SelectKBest, chi2
mpl.use('Agg')
import matplotlib.pyplot as plt

#DEFINE ALL THE FUNCTIONS
# function loads the raw transaction file, cleans the description text and pickles the result
def load_process_data(inputfile, outputfile, small=False):
    """Load raw transactions, clean the description text and pickle the frame.

    Parameters
    ----------
    inputfile : str
        Path to a tab-separated file with a header row. It must contain the
        columns ``transaction_description`` and ``account_type``.
    outputfile : str
        Path the cleaned DataFrame is pickled to.
    small : bool, optional
        When true, only the first 2,000,000 rows of the file are kept.

    Notes
    -----
    Rows containing any missing value are dropped. Nothing in this function
    aggregates rows, so the two counts in the final message are always equal.
    """
    print('*********************************************************************************')
    print('*************STEP1: LOAD TX DATA AND PROCESS DOWN TO INDV DATA ******************')
    print('*********************************************************************************')
    # load in file; quoting=3 is csv.QUOTE_NONE, i.e. quote characters are plain text
    data = pd.read_csv(inputfile, header=0, delimiter="\t", quoting=3)
    if small:
        data = data.head(2000000)

    # Remove null rows (rebinding instead of inplace avoids mutating a slice
    # of the frame returned by head())
    print('The raw loaded data is of shape: {}'.format(data.shape))
    data = data.dropna()
    print('Dropping missing data is of shape: {}'.format(data.shape))
    data = data.reset_index(drop=True)
    n_tx = len(data)

    # Clean on a local Series and assign the result once. An inplace replace
    # on data['col'] is chained assignment and may act on a temporary copy.
    description = data['transaction_description']
    # remove every lowercase 'x' and every digit
    description = description.replace('x|[0-9]', '', regex=True)
    # remove everything that is not a letter or whitespace
    description = description.replace(r'[^a-zA-Z\s]', '', regex=True)

    # remove problematic words
    data['clean_tx_string'] = description.apply(cleaner.transactionstring_to_wordlist)

    # drop original words
    data = data.drop(['transaction_description', 'account_type'], axis=1)

    # save down for later processing; the with-block closes (and flushes) the file
    with open(outputfile, 'wb') as handle:
        pickle.dump(data, handle)

    n_indv = len(data)
    print('Loaded and processed data from {:,d} transactions and {:,d} individuals'.format(n_tx, n_indv))

# function to create the training and testing sets
def create_train_test(inputfile, train_fraction):
    """Split the pickled data into train/test sets and build the feature matrices.

    Parameters
    ----------
    inputfile : str
        Path to the pickled DataFrame. It must contain the columns
        ``is_food`` (label) and ``clean_tx_string`` (text); every other
        column is appended to the feature matrix as-is.
    train_fraction : float
        Fraction of rows sampled (with a fixed seed) into the training set;
        the remaining rows form the test set.

    Side effects
    ------------
    Writes ``vec.p``, ``train.p``, ``test.p`` and ``col_name.p`` into
    ``<path>/python``, where ``path`` is the module-level data directory.
    Each saved matrix has the label in column 0, followed by the
    bag-of-words columns and then the remaining columns of the frame.
    Also sets the global pandas float display format.
    """
    print('*********************************************************************************')
    print('*************STEP2: CREATE THE TESTING AND TRAINING SET *************************')
    print('*********************************************************************************')
    # load data
    with open(inputfile, "rb") as handle:
        data = pickle.load(handle)

    # splitting up the data into training and testing set based on fraction provided
    print('Shuffling {:,d} individuals and splitting them into training ({}) and test sets'.format(
        len(data), train_fraction))
    random.seed(1234)
    # randomly choose row labels from the dataset; random.sample needs a real
    # sequence, so the index is materialised as a list first
    rows = random.sample(list(data.index), int(train_fraction * len(data)))
    # the training data is the sampled rows while the testing set is the complement.
    # .loc is the label-based replacement for the deprecated .ix indexer
    train_data = data.loc[rows]
    test_data = data.drop(rows)

    pd.options.display.float_format = '{:,.0f}'.format
    print('Summary of training set: \n')
    print(train_data.describe())
    print('Summary of test set: \n')
    print(test_data.describe())

    print("\nCreating the bag of words...")
    # Initialize the "CountVectorizer" object, which is scikit-learn's bag of words tool.
    # No document-frequency cutoff is applied here (max_df=1.0, min_df left at its default).
    # Unigrams and bigrams are recorded as binary presence indicators.
    vectorizer = CountVectorizer(analyzer="word", tokenizer=None, preprocessor=None, stop_words=None,
                                 max_df=1.0,
                                 ngram_range=(1, 2),
                                 binary=True
                                 )
    # creates a sparse matrix of dummy vars for each word; the vocabulary is
    # learned from the training rows only and re-used for the test rows
    train_words = vectorizer.fit_transform(train_data['clean_tx_string'])
    test_words = vectorizer.transform(test_data['clean_tx_string'])

    # put together the whole (y,X) matrix
    # column names of the non bag-of-words features: everything except the
    # label and the cleaned transaction string
    col_name = list(train_data.columns.values)
    col_name.remove('is_food')
    col_name.remove('clean_tx_string')

    # stacking the label, the BoW features, and the non-BoW features
    train_matrix = hstack([np.matrix(train_data['is_food']).T, train_words,
                           np.matrix(train_data[col_name])])
    test_matrix = hstack([np.matrix(test_data['is_food']).T, test_words,
                          np.matrix(test_data[col_name])])

    print("feature matrix (x) dimensions for the training {} and testing {} data".format(
        train_matrix.shape, test_matrix.shape))

    # Save the vectorizer, the two matrices and the column names for future use;
    # each with-block closes its file handle as soon as the dump is done
    artifacts = [('vec.p', vectorizer),
                 ('train.p', train_matrix),
                 ('test.p', test_matrix),
                 ('col_name.p', col_name)]
    for filename, obj in artifacts:
        with open('{}/python/{}'.format(path, filename), 'wb') as handle:
            pickle.dump(obj, handle)

    print("\n Generated and saved training and testing data")

# function to fit the random forest model
def fit_model(max_nodes, bs, n_prune, print_tree=False, params=None):
    """Fit a random forest on the saved training matrix and pickle the model.

    Reads ``col_name.p``, ``vec.p`` and ``train.p`` from ``<path>/python``
    (``path`` is the module-level data directory). Column 0 of the training
    matrix is the label, followed by the bag-of-words columns and then the
    non bag-of-words columns.

    Parameters
    ----------
    max_nodes : int or None
        Passed as ``max_leaf_nodes`` to the tree / forest.
    bs : int
        Number of trees (``n_estimators``) in the forest.
    n_prune : int
        Number of bag-of-words features kept by a chi-squared ``SelectKBest``;
        0 disables pruning.
    print_tree : bool, optional
        When true, a single decision tree is also fitted and rendered to
        ``pdf/tree.pdf`` (relative to the working directory).
    params : optional
        Unused; kept so existing callers keep working.

    Side effects
    ------------
    Writes ``clf.p`` (and ``ch2.p`` when pruning is enabled) to ``<path>/python``.
    """
    print('*********************************************************************************')
    print('************************ STEP3: FIT THE MODEL  **********************************')
    print('*********************************************************************************')
    # load data
    with open('{}/python/col_name.p'.format(path), 'rb') as handle:
        col_name = pickle.load(handle)
    with open('{}/python/vec.p'.format(path), 'rb') as handle:
        vectorizer = pickle.load(handle)
    with open('{}/python/train.p'.format(path), 'rb') as handle:
        train_matrix = pickle.load(handle)

    # convert once to CSC, the format that supports column slicing
    train_csc = train_matrix.tocsc()
    train_y = train_csc[:, 0].toarray().astype(int).ravel()
    train_x = train_csc[:, 1:].tocoo()

    print('\n The feature matrix is of shape {}'.format(train_x.shape))

    # prune BoW features using the chi2 method
    ch2 = None
    if n_prune != 0:
        print("Extracting {} best features by a chi-squared test".format(n_prune))
        # configure the selector to keep only the top k BoW features by chi2 score
        ch2 = SelectKBest(chi2, k=n_prune)
        # the first n_words columns are the BoW features
        n_words = len(vectorizer.vocabulary_)
        feature_csc = train_x.tocsc()
        # slice [:n_words] so the last vocabulary column is included
        # (a stop of n_words - 1 would silently drop it)
        train_x_words = ch2.fit_transform(feature_csc[:, :n_words], train_y)
        # paste back on the non-BoW vars
        train_x = hstack([train_x_words, feature_csc[:, n_words:]])
        print('\n After feature pruning, the feature matrix is of shape {}'.format(train_x.shape))

    class_names = ['Non-FF', 'FF']

    print("\n Fitting the model...")

    # extract feature names for BoW as well as non-BoW vars
    vocab = vectorizer.get_feature_names()
    if ch2 is not None:
        vocab = [vocab[i] for i in ch2.get_support(indices=True)]

    vocab_str = [str(x.encode('utf-8')) for x in vocab]
    feature_str = vocab_str + col_name

    # prints the tree if the flag is set to true
    if print_tree:
        single_tree = tree.DecisionTreeClassifier(
            max_leaf_nodes=max_nodes,
            min_samples_split=20,
            min_samples_leaf=20,
            max_features=None)
        single_tree = single_tree.fit(train_x, train_y)

        dot_data = StringIO()
        tree.export_graphviz(single_tree, out_file=dot_data,
                             feature_names=feature_str,
                             filled=True, rounded=True, special_characters=True,
                             class_names=class_names
                             )
        graph = pydot.graph_from_dot_data(dot_data.getvalue())
        # newer pydot releases return a list of graphs instead of a single graph
        if isinstance(graph, (list, tuple)):
            graph = graph[0]
        graph.write_pdf("pdf/tree.pdf")

        print('\n Tree printed')

    # set the model parameters
    clf = RandomForestClassifier(n_estimators=bs,
                                 n_jobs=-1,
                                 max_leaf_nodes=max_nodes,
                                 max_features=None,
                                 random_state=1234,
                                 min_samples_split=20,
                                 min_samples_leaf=20,
                                 verbose=True)
    start = time()
    clf.fit(train_x, train_y)
    print(time() - start)

    # show the (up to) 30 most important features. Ranking the pairs directly
    # keeps features that share a name, and the slice cannot run past the end
    # when fewer than 30 features exist.
    ranked = sorted(zip(feature_str, clf.feature_importances_),
                    key=operator.itemgetter(1), reverse=True)
    for name_and_importance in ranked[:30]:
        print(name_and_importance)

    # Save down model (save as pickle as joblib may be more efficient but creates way too many files)
    with open('{}/python/clf.p'.format(path), "wb") as handle:
        pickle.dump(clf, handle)
    if ch2 is not None:
        with open('{}/python/ch2.p'.format(path), "wb") as handle:
            pickle.dump(ch2, handle)


# function to prepare dataset for prediction
def prepare_prediction(inputfile, outputfile):
    """Build the prediction feature matrix for one pickled data set.

    Parameters
    ----------
    inputfile : str
        Path to a pickled DataFrame containing ``clean_tx_string``,
        ``user_id``, ``date`` and ``transaction_amount``.
    outputfile : str
        Path the two-element list ``[id_frame, feature_matrix]`` is pickled to.
        ``id_frame`` holds ``user_id``, ``date`` and ``transaction_amount``;
        ``feature_matrix`` is the BoW columns followed by every column of the
        frame other than ``user_id``, ``date`` and ``clean_tx_string``.

    The vectorizer is loaded from ``<path>/python/vec.p``.
    """
    print('*********************************************************************************')
    print('*************STEP2: PREPARE THE PREDICTION DATA SET *****************************')
    print('*********************************************************************************')
    # load data
    with open(inputfile, "rb") as handle:
        data = pickle.load(handle)
    with open('{}/python/vec.p'.format(path), "rb") as handle:
        vectorizer = pickle.load(handle)

    predict_words = vectorizer.transform(data['clean_tx_string'])

    # these are columns for identification but not used in prediction
    id_col = ['user_id', 'date']
    # all remaining columns except the tx string are the non-BoW predictors
    # NOTE(review): presumably these must match, in order, the col_name list
    # saved at training time - confirm against the training data layout
    col_name = [column for column in data.columns.values if column not in id_col]
    col_name.remove('clean_tx_string')
    # add transaction amount to the id columns so it is available for the output
    # (it stays in col_name as a predictor as well)
    id_col.append('transaction_amount')

    # appending BoW features, and non-BoW features
    predict_matrix = hstack([predict_words, np.matrix(data[col_name])])
    # first part is the ID features and second is the matrix needed for the prediction
    predict_object = [data[id_col], predict_matrix]

    print("feature matrix (x) dimensions for the prediction {} data".format(predict_matrix.shape))

    # save prediction data set
    with open(outputfile, 'wb') as handle:
        pickle.dump(predict_object, handle)

    print("\n Prepared the prediction data")


# function to predict categories in the tx data and save it to csv file
def predict_tx(n_prune, inputfile, outputfile):
    """Predict a category per row and write per-user/date sums to CSV.

    Parameters
    ----------
    n_prune : int
        Must be non-zero exactly when the model was fitted with pruning; the
        fitted selector is then loaded from ``<path>/python/ch2.p``.
    inputfile : str
        Path to the pickled ``[id_frame, feature_matrix]`` list written by
        ``prepare_prediction``.
    outputfile : str
        Path of the CSV to write: the id frame summed by ``user_id``,
        ``date`` and predicted ``cat_type``, unstacked so each category
        becomes its own column.
    """
    print('*********************************************************************************')
    print('*************STEP3: PREDICT AND OUTPUT DATA ******* *****************************')
    print('*********************************************************************************')
    # load all the objects
    with open('{}/python/clf.p'.format(path), "rb") as handle:
        clf = pickle.load(handle)
    with open('{}/python/vec.p'.format(path), 'rb') as handle:
        vectorizer = pickle.load(handle)
    with open(inputfile, 'rb') as handle:
        predict_object = pickle.load(handle)
    # the dataframe structure for eventual output (copied so the new column
    # is added to an independent frame)
    output = predict_object[0].copy()
    # the X matrix used for prediction
    predict_x = predict_object[1]

    # Incorporate pruned BoW features using the chi2 method
    if n_prune != 0:
        with open('{}/python/ch2.p'.format(path), "rb") as handle:
            ch2 = pickle.load(handle)
        n_words = len(vectorizer.vocabulary_)
        # Feed the selector exactly as many BoW columns as it was fitted on.
        # That is n_words for a selector saved by fit_model above, while a
        # selector pickled by an older run (fitted on n_words - 1 columns)
        # still loads and transforms without a shape mismatch.
        n_selector_inputs = ch2.get_support().shape[0]
        feature_csc = predict_x.tocsc()
        predict_x_words = ch2.transform(feature_csc[:, :n_selector_inputs])
        # paste back on the non-BoW vars, which start after the BoW columns
        predict_x = hstack([predict_x_words, feature_csc[:, n_words:]])

    print('\n The feature matrix is of shape {}'.format(predict_x.shape))

    # predict the categories
    result = clf.predict(predict_x).ravel()

    # add the predicted category to the output frame
    output['cat_type'] = result

    # aggregate (sum) by user_id, date, cat_type
    grouped = output.groupby(['user_id', 'date', 'cat_type'])
    output_agg = grouped.aggregate(np.sum)
    # drop missing values to save space
    output_drop = output_agg.dropna()
    # convert long to wide so we have columns for each cat type
    output_drop.unstack('cat_type').to_csv(outputfile, header=True, tupleize_cols=True)

    print('\n Output data to {}'.format(outputfile))

# 0000000000000000000000000000000000
# EXECUTABLE STARTS HERE
# 0000000000000000000000000000000000
import os

# directory that houses all the data (it's separate from the dropbox because I don't want to sync it)
# open() does not expand '~', so resolve the home directory explicitly.
# The functions above read this module-level name when building file paths.
path = os.path.expanduser('~/mgelman/data/ML/restaurant')

# STEP1:  LOAD TRANSACTION DATA AND PROCESS IT
inputfile = '{}/raw/raw_data_rest.tsv'.format(path)
outputfile = '{}/python/raw_data_rest.p'.format(path)
start = time()
# The load step is currently disabled, so the pickled file from an earlier run
# is re-used and the duration printed below is not a real load time.
#load_process_data(inputfile, outputfile, small=False)
print("Loading data took {} minutes".format(round((time() - start) / 60)))

# STEP2:  CREATE THE TESTING AND TRAINING SET
# SAVES DOWN THE VECTORIZER AS WELL AS THE TRAINING AND TESTING SETS
train_fraction = 0.9
create_train_test(outputfile, train_fraction)

# prune parameter: number of bag-of-words features kept by the chi2 selector
n_prune = 10000

# fit model
start = time()
fit_model(max_nodes=None, bs=32, n_prune=n_prune, print_tree=False)
print("Fitting model took {} minutes".format(round((time() - start) / 60)))

#########################
#PREDICTION STARTS HERE
########################
# For each partition: load the raw prediction data, build its feature matrix
# with the saved vectorizer, predict categories and write one CSV.
#Declare directories (a little messy because we have to iterate over partitions)
# These fragments are concatenated onto `path` below to form the file names.
temp_outputdir ='/python/predict/';raw_outputname='raw_predict_data';processed_outputname='processed_predict_data_rest'
outputdir='/out/';outputname='rest_type_hat'

#load in data for each partition
# xrange(1,10) yields the partition numbers 1 through 9
for x in xrange(1,10):
    start = time()
    print 'loading and processing partition: ', x
    temp_outputfile=''.join([path,temp_outputdir,raw_outputname,str(x),'.p'])

    #STEP 1: LOAD DATA
    # NOTE(review): neither `category_ML` nor `temp_inputfile` is defined or
    # imported anywhere in the visible file, so this line raises NameError as
    # written. The load_process_data defined in this file takes
    # (inputfile, outputfile, small) and has no `type` or `trim` parameters.
    # Presumably a different version of the loader was intended - confirm the
    # module and the per-partition input path before running.
    category_ML.load_process_data(temp_inputfile, temp_outputfile, type='predict',trim=0)

    #STEP 2: PROCESS DATA (NEED TO RE-PROCESS EACH TIME BECAUSE THE BAG OF WORDS MATRIX WILL VARY DEPENDING ON PRUNING AND THE EXACT SAMPLE USED)
    python_outputfile=''.join([path,temp_outputdir,processed_outputname,str(x),'.p'])
    prepare_prediction(temp_outputfile,python_outputfile)

    #STEP 3: PREDICT CATEGORIES
    # n_prune must be the same value that was passed to fit_model above
    csv_outfile = ''.join([path,outputdir,outputname,str(x),'.csv'])
    predict_tx(n_prune,python_outputfile,csv_outfile)

    # elapsed time for this partition, in whole minutes
    print "predicting tx took {} minutes".format(round((time() - start) / 60))